The purpose is to classify a given silhouette as one of three types of vehicle, using a set of features extracted from the silhouette. The vehicle may be viewed from one of many different angles.
● All the features are geometric features extracted from the silhouette. ● All are numeric in nature
Apply a dimensionality reduction technique – PCA – and train a model using principal components instead of training the model on just the raw data.
#Import all the necessary modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from scipy.stats import zscore
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
# Load the vehicle silhouette dataset from vehicle.csv.
df = pd.read_csv('vehicle.csv')
df.head()
# Dataset dimensions (rows, columns).
df.shape
# Per-column data types.
df.dtypes
# Column-level summary: non-null counts, dtypes, memory usage.
df.info()
# Count of missing values in each column.
df.isna().sum()
# Inspect only the rows that contain at least one missing value,
# restricted to the columns that actually have missing data.
missing_values_cols = df.columns[df.isnull().any()]
rows_with_na = df[df.isnull().any(axis=1)]
rows_with_na[missing_values_cols].head()
rows_with_na[missing_values_cols].shape
There are a total of 33 rows with missing values in one or more of the 14 affected columns.
# Five-point (descriptive) summary of every numeric column.
df.describe().transpose()
# Class distribution: number of silhouettes per vehicle type.
class_counts = df['class'].value_counts()
class_counts
class_counts.plot(kind="bar")
Note: Cars are almost double in number compared to buses and vans; vans are the fewest.
# Plot a histogram and a boxplot for every feature column.
# BUG FIX: the original built the column list as
# list(df.drop(["class"], axis=1))[0:-1] — the extra [0:-1] slice silently
# dropped the LAST feature from the histogram grid. Use all feature columns.
columns = list(df.drop(["class"], axis=1).columns)
df[columns].hist(bins=10, figsize=(15, 30), layout=(8, 3));
plt.figure(figsize=(50, 50))
plt.subplots_adjust(left=None, bottom=None, right=None, top=None, wspace=None, hspace=0.5)
i = 1
for feature in columns:  # one boxplot per feature, laid out on an 8x3 grid
    plt.subplot(8, 3, i)
    b = sns.boxplot(x=df[feature])
    b.set_xlabel(feature, fontsize=30)
    b.tick_params(labelsize=30)
    i = i + 1
# The following attributes have heavy right-tail outliers, so compress
# their range with a logarithmic transform:
#   pr.axis_aspect_ratio, max.length_aspect_ratio, scaled_radius_of_gyration.1
for skewed_col in ['pr.axis_aspect_ratio',
                   'max.length_aspect_ratio',
                   'scaled_radius_of_gyration.1']:
    df[skewed_col] = np.log(df[skewed_col])
df.head()
# Now we will impute the missing values with the MEDIAN of each respective
# field (median is robust to the outliers observed in the boxplots above).
df.fillna(df.median(),axis=0,inplace=True)
df.isna().sum()
Null values are successfully replaced with the median values of each field.
# Standardise every feature column with z-scores, then re-attach the
# untouched 'class' target column.
features_only = df.drop(labels='class', axis=1)
df_scaled = features_only.apply(zscore)
df_scaled['class'] = df['class']
df_scaled.head()
# Kendall rank correlation between all scaled features.
df_scaled.corr(method='kendall')
# Pairwise scatter plots with KDE on the diagonal (original, unscaled data).
sns.pairplot(df,diag_kind='kde')
corr = abs(df_scaled.corr(method='kendall')) # correlation matrix (absolute values)
lower_triangle = np.tril(corr, k = -1) # select only the lower triangle of the correlation matrix
mask = lower_triangle == 0 # to mask the upper triangle in the following heatmap
plt.figure(figsize = (15,8)) # setting the figure size
sns.set_style(style = 'white') # Setting it to white so that we do not see the grid lines
sns.heatmap(lower_triangle, center=0.5, cmap= 'Blues', annot= True, xticklabels = corr.index, yticklabels = corr.columns,
cbar= False, linewidths= 1, mask = mask) # lower-triangle correlation heatmap
plt.xticks(rotation = 50) # Aesthetic purposes
plt.yticks(rotation = 20) # Aesthetic purposes
plt.show()
# Split the scaled data into train (70%) and test (30%) partitions.
df_train, df_test = train_test_split(df_scaled, test_size=0.3, random_state=123)
df_train.shape, df_test.shape
# Separate features (X) from the target (y) in each partition.
y_train = df_train['class']
X_train = df_train.drop(labels='class', axis=1)
y_test = df_test['class']
X_test = df_test.drop(labels='class', axis=1)
X_train.shape, y_train.shape, X_test.shape, y_test.shape
# Building a Support Vector Machine on the train data
# (default RBF kernel; a polynomial alternative is kept below for reference).
# svc_model = SVC(gamma = 'auto', kernel= 'poly', degree=1)
svc_model = SVC()
svc_model.fit(X_train, y_train)
y_pred = svc_model.predict(X_test)
# Check the accuracy on the training set
print(svc_model.score(X_train, y_train))
# Calculate the accuracy on the test dataset using the accuracy_score function of metrics
metrics.accuracy_score(y_pred,y_test)
# Class distribution within the held-out test set.
y_test.value_counts()
# Confusion matrix with a fixed label order: car, bus, van.
cm2 = confusion_matrix(y_test, y_pred, labels=["car", "bus", "van"])
cm2
# K-fold cross validation using an SVC model on the scaled (non-PCA) data.
num_folds = 10
seed = 7
X = df_scaled.drop(labels='class', axis=1)
Y = df_scaled['class']
# BUG FIX: modern scikit-learn raises ValueError when random_state is set on
# KFold while shuffle is False; shuffle=True also makes the seed meaningful.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
svc_kfold = SVC()
results = cross_val_score(svc_kfold, X, Y, cv=kfold)
print(results)
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))
print("95 Percent Confidence Accuracy Range is Between: %.3f%% and %.3f%%" % ((results.mean()*100.0 -(1.96 * results.std()*100.0)),(results.mean()*100.0 +(1.96 * results.std()*100.0))))
Note: We observe a modest improvement in the accuracy estimate when using K-fold cross-validation.
# Check whether the dataset contains any fully duplicated rows.
dup_mask = df.duplicated()
print('Number of duplicate rows = %d' % dup_mask.sum())
df[dup_mask]
# Create a covariance matrix of the scaled features for identifying
# the principal components.
cov_matrix = np.cov(df_scaled.drop(labels='class', axis=1).T)
# FIX: the original print('...%s', cov_matrix) never interpolated the
# placeholder — it printed the literal format string and the matrix as two
# arguments; interpolate properly instead.
print('Covariance Matrix \n%s' % cov_matrix)
print(cov_matrix.shape)
# Get eigenvalues and eigenvectors.
# FIX: a covariance matrix is symmetric, so use eigh rather than eig —
# eig can return spurious tiny complex parts from floating-point round-off,
# which would break the downstream sorting and plotting. eigh guarantees
# real eigenvalues (returned in ascending order; later code sorts anyway).
eig_vals, eig_vecs = np.linalg.eigh(cov_matrix)
print('Eigen Vectors \n%s' % eig_vecs)
print('\n Eigen Values \n%s' % eig_vals)
print("Eigen Values:")
pd.DataFrame(eig_vals).transpose()
# Percentage of total variance explained by each eigenvalue (largest first),
# plus the running cumulative total.
total_variance = sum(eig_vals)
var_exp = [(ev / total_variance) * 100 for ev in sorted(eig_vals, reverse=True)]
cum_var_exp = np.cumsum(var_exp)  # one entry per principal-component dimension
print("Cumulative Variance Explained", cum_var_exp)
plt.plot(var_exp)
# Scree plot: bars for individual explained variance, a step line for the
# cumulative total.
plt.figure(figsize=(15, 6))
component_range = range(1, eig_vals.size + 1)
plt.bar(component_range, var_exp, alpha=0.5, align='center', label='Individual explained variance')
plt.step(component_range, cum_var_exp, where='mid', label='Cumulative explained variance')
plt.xlabel('Principal Components')
plt.ylabel('Explained Variance Ratio')
plt.legend(loc='best')
plt.tight_layout()
plt.show()
First 7 principal components explain more than 96.11% of the variance in the data.
# Use the PCA class from sklearn to find the principal components and
# project the scaled features onto them.
# NOTE - we are generating only 7 PCA dimensions (dimensionality reduction from 18 to 7)
pca = PCA(n_components=7)
data_reduced = pca.fit_transform(df_scaled.drop(labels='class', axis=1))
data_reduced.transpose()
print(data_reduced.shape)
print(data_reduced)
# Variance captured by each of the 7 components.
print(pca.explained_variance_)
# Component loading matrix, shape (n_components, n_original_features).
print(pca.components_.shape)
pca.components_
# Show each component's loadings against the original feature names,
# then visualise them as a heatmap.
df_comp = pd.DataFrame(pca.components_,columns=list(X_train))
df_comp
plt.figure(figsize=(12,6))
sns.heatmap(df_comp,cmap='plasma',)
Note: The heatmap shows how strongly each original feature loads onto each of the 7 principal components.
print(df_scaled.shape)
df_scaled.head()
# Project the full scaled feature set onto the 7 principal components
# and re-attach the target column.
projected = pca.transform(df_scaled.drop(labels='class', axis=1))
df_scaled_pca = pd.DataFrame(projected)
print(df_scaled_pca.shape)
df_scaled_pca['class'] = df_scaled['class']
df_scaled_pca.head()
# Split the principal-component data into train (70%) and test (30%) sets.
df_train_pca, df_test_pca = train_test_split(df_scaled_pca, test_size=0.3, random_state=123)
df_train_pca.shape, df_test_pca.shape
# Separate features (X) from the target (y) in each partition.
y_train_pca = df_train_pca['class']
X_train_pca = df_train_pca.drop(labels='class', axis=1)
y_test_pca = df_test_pca['class']
X_test_pca = df_test_pca.drop(labels='class', axis=1)
X_train_pca.shape, y_train_pca.shape, X_test_pca.shape, y_test_pca.shape
# Building a Support Vector Machine on the PCA-transformed train data
# (default RBF kernel; a polynomial alternative is kept below for reference).
# svc_model_pca = SVC(gamma = 'auto', kernel= 'poly', degree=1)
svc_model_pca = SVC()
svc_model_pca.fit(X_train_pca, y_train_pca)
y_pred_pca = svc_model_pca.predict(X_test_pca)
# Check the accuracy on the training set
print(svc_model_pca.score(X_train_pca, y_train_pca))
# Calculate the accuracy on the test dataset using the accuracy_score function of metrics
metrics.accuracy_score(y_pred_pca,y_test_pca)
# Class distribution within the held-out PCA test set.
y_test_pca.value_counts()
# Confusion matrix with a fixed label order: car, bus, van.
cm2 = confusion_matrix(y_test_pca, y_pred_pca, labels=["car", "bus", "van"])
cm2
# K-fold cross validation using an SVC model on the PCA-transformed data.
num_folds = 10
seed = 7
X = df_scaled_pca.drop(labels='class', axis=1)
Y = df_scaled_pca['class']
# BUG FIX: modern scikit-learn raises ValueError when random_state is set on
# KFold while shuffle is False; shuffle=True also makes the seed meaningful.
kfold_pca = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
svc_pca_kfold = SVC()
results_pca = cross_val_score(svc_pca_kfold, X, Y, cv=kfold_pca)
print(results_pca)
print("Accuracy: %.3f%% (%.3f%%)" % (results_pca.mean()*100.0, results_pca.std()*100.0))
print("95 Percent Confidence Accuracy Range is Between: %.3f%% and %.3f%%" % ((results_pca.mean()*100.0 -(1.96 * results_pca.std()*100.0)),(results_pca.mean()*100.0 +(1.96 * results_pca.std()*100.0))))
# Comparing the accuracy scores on train and test data for the model built
# on the original scaled features versus the model built on 7 PCA components,
# along with their 10-fold cross-validation scores and 95% confidence ranges.
print("Original Trained data accuracy = ", svc_model.score(X_train, y_train))
print("PCA Trained data accuracy = ",svc_model_pca.score(X_train_pca, y_train_pca))
print("-----")
print("Original test data accuracy = ",metrics.accuracy_score(y_pred,y_test))
print("PCA test data accuracy = ",metrics.accuracy_score(y_pred_pca,y_test_pca))
print("-----")
print("Original data cross validation score = ",results.mean()*100.0)
print("PCA data cross validation score = ",results_pca.mean()*100.0)
print("Original Data 95 Percent Confidence Accuracy Range is Between: %.3f%% and %.3f%%" % ((results.mean()*100.0 -(1.96 * results.std()*100.0)),(results.mean()*100.0 +(1.96 * results.std()*100.0))))
print("PCA Data 95 Percent Confidence Accuracy Range is Between: %.3f%% and %.3f%%" % ((results_pca.mean()*100.0 -(1.96 * results_pca.std()*100.0)),(results_pca.mean()*100.0 +(1.96 * results_pca.std()*100.0))))
But everything has two sides: a disadvantage of PCA is that we lose interpretability — the principal components act as a black box compared to the original features.
Considering the K-Fold cross validation score for original data it is 96.69% where as it is 92.67% for PCA data with same number of 10 folds. This means in production the range of accuracy with 95% confidence is between 93.442% and 99.933% with original data. For PCA data the 95% confidence accuracy range is between 87.935% and 97.407%.